PCA: an orthogonal linear transformation which projects the genotype data into the new reduced dimensional space such that the greater variance comes in an order
UMAP: a novel non-linear dimensionality reduction technique based on Riemannian geometry and algebraic topology to model and preserve the high-dimensional topology of data points in the lowdimensional space
PCA–UMAP: an application of UMAP for principal components of genotype data to be computationally more advantageous and statistically less noisy
## Haplogroup assignments
dat <- generate_snp_data_fixed("/Users/sheaandrews/GitCode/MitoImpute/ReferencePanel/ReferencePanel.map",
"/Users/sheaandrews/GitCode/MitoImpute/ReferencePanel/ReferencePanel.ped")
data(nodes)
classifications.raw <- HiMC::getClassifications(dat)
classifications <- as_tibble(classifications.raw[-nrow(classifications.raw),]) #remove Mitochondria_Information row
ped <- dat
colnames(ped) <- c("Family", "Individual", "Father", "Mother",
"Sex", "Phenotype", paste0('mt', colnames(ped[, 7:ncol(ped)])))
ped <- as_tibble(ped) %>%
na_if(., "0 0") %>%
mutate_at(vars(starts_with("mt")), as_factor) %>%
mutate_at(vars(starts_with("mt")), as.numeric) %>%
left_join(classifications) %>%
mutate(macro = ifelse(str_detect(haplogroup, "^L"),
substr(haplogroup, start = 1, stop = 2),
substr(haplogroup, start = 1, stop = 1))) %>%
select(Family, Individual, Father, Mother, Sex, Phenotype, haplogroup, macro, everything())
Run PCA analysis of mtReference Panel
## PCA
res.pca <- ped %>%
select(starts_with("mt")) %>%
PCA(., scale.unit = T, ncp = 10, graph = FALSE)
ind.pca <- get_pca_ind(res.pca)$coord %>%
as_tibble() %>%
bind_cols(select(ped, Individual, haplogroup, macro)) %>%
select(Individual, haplogroup, macro, everything()) %>%
filter(macro %nin% c("u")) %>%
filter(!is.na(macro))
## Count number of mthg
ind.pca %>% count(macro) %>% print(n = Inf)
## # A tibble: 19 x 2
## macro n
## <chr> <int>
## 1 A 1176
## 2 B 143
## 3 C 1466
## 4 D 2073
## 5 H 7651
## 6 I 549
## 7 J 1733
## 8 K 1479
## 9 L0 1026
## 10 L1 837
## 11 L2 1252
## 12 L3 2251
## 13 M 5726
## 14 N 1230
## 15 R 2444
## 16 T 1607
## 17 U 3353
## 18 V 525
## 19 W 433
ggplot(ind.pca, aes(x = Dim.1, y = Dim.2, colour = macro)) + geom_point() + theme_bw()
plot_ly(ind.pca, x = ~Dim.2, y = ~Dim.1, z = ~Dim.3, color = ~macro, type = 'scatter3d', mode = 'markers', marker = list(size = 3),
hoverinfo = 'text', text = ~paste('</br> ID: ', Individual,
'</br> Haplogroup: ', macro,
'</br> PC1: ', round(Dim.1, 2),
'</br> PC2: ', round(Dim.2, 2),
'</br> PC3: ', round(Dim.3, 2)))
## PCA
oa.pca <- ped %>%
filter(macro %nin% c("L0", "L1", "L2", "L3")) %>%
select(starts_with("mt")) %>%
PCA(., scale.unit = T, ncp = 10, graph = FALSE)
oa.ind.pca <- get_pca_ind(oa.pca)$coord %>%
as_tibble() %>%
bind_cols(select(filter(ped, macro %nin% c("L0", "L1", "L2", "L3")), Individual, haplogroup, macro)) %>%
select(Individual, haplogroup, macro, everything()) %>%
filter(macro %nin% c("u")) %>%
filter(!is.na(macro))
ggplot(oa.ind.pca, aes(x = Dim.1, y = Dim.2, colour = macro)) + geom_point() + theme_bw() +
scale_color_manual(values = cols25())
plot_ly(oa.ind.pca, x = ~Dim.2, y = ~Dim.1, z = ~Dim.3, color = ~macro,
type = 'scatter3d', mode = 'markers', marker = list(size = 3),
hoverinfo = 'text', text = ~paste('</br> ID: ', Individual,
'</br> Haplogroup: ', macro,
'</br> PC1: ', round(Dim.1, 2),
'</br> PC2: ', round(Dim.2, 2),
'</br> PC3: ', round(Dim.3, 2)))
## Haplogroup assignments
dat_1kg <- generate_snp_data_fixed(
"/Users/sheaandrews/GitCode/MitoImputePrep/DerivedData/ThousandGenomes/chrMT_1kg_norm_decomposed_firstAlt.map",
"/Users/sheaandrews/GitCode/MitoImputePrep/DerivedData/ThousandGenomes/chrMT_1kg_norm_decomposed_firstAlt.ped")
info_1kg <- read_tsv("/Users/sheaandrews/GitCode/MitoImputePrep/data/ThousandGenomes/20130606_g1k.ped", guess_max = 10000) %>%
select(Individual = `Individual ID`, Family_ID = `Family ID`, Father = `Paternal ID`, Mother = `Maternal ID`, Population, Gender)
data(nodes)
classifications_1kg.raw <- HiMC::getClassifications(dat_1kg)
classifications_1kg <- as_tibble(classifications_1kg.raw[-nrow(classifications_1kg.raw),]) #remove Mitochondria_Information row
ped_1kg <- dat_1kg
colnames(ped_1kg) <- c("Family", "Individual", "Father", "Mother",
"Sex", "Phenotype", paste0('mt', colnames(ped_1kg[, 7:ncol(ped_1kg)])))
ped_1kg <- as_tibble(ped_1kg) %>%
na_if(., "0 0") %>%
mutate_at(vars(starts_with("mt")), as_factor) %>%
mutate_at(vars(starts_with("mt")), as.numeric) %>%
left_join(classifications_1kg) %>%
mutate(macro = ifelse(str_detect(haplogroup, "^L"),
substr(haplogroup, start = 1, stop = 2),
substr(haplogroup, start = 1, stop = 1))) %>%
select(-Family, -Sex, -Father, -Mother) %>%
left_join(info_1kg, by = "Individual") %>%
select(Family = Family_ID, Individual, Father, Mother, Sex = Gender, Phenotype,
pop = Population, haplogroup, macro, full_path, everything()) %>%
filter(macro %nin% c("u")) %>%
filter(!is.na(macro)) # %>%
# group_by(Family) %>%
# slice(1) %>%
# ungroup()
# filter(macro %nin% c("L0", "L1", "L2", "L3"))
miss_gt <- ped_1kg %>%
select(everything()) %>% # replace to your needs
summarise_all( ~ sum(is.na(.))) %>%
t() %>%
as_tibble(rownames = "var") %>%
filter(V1 > 0) %>% pull(var)
count(ped_1kg, pop) %>% print(n = Inf)
count(ped_1kg, macro) %>% print(n = Inf)
count(ped_1kg, haplogroup) %>% print(n = Inf)
## PCA
res.pca_1kg <- ped_1kg %>%
select(starts_with("mt")) %>%
select(-all_of(miss_gt)) %>%
# mutate_all(., .funs = function(x){ifelse(is.na(x), round(mean(x, na.rm = TRUE)), x)}) %>%
PCA(., scale.unit = F, ncp = 10, graph = FALSE)
ind.pca_1kg <- get_pca_ind(res.pca_1kg)$coord %>%
as_tibble() %>%
bind_cols(select(ped_1kg, Individual, haplogroup, macro, pop)) %>%
select(Individual, haplogroup, macro, everything())
ggplot(ind.pca_1kg, aes(x = Dim.1, y = Dim.2, colour = macro)) +
geom_point() +
scale_color_manual(values = cols25()) +
theme_bw() + theme(legend.position = "bottom")
eig.val_1kg <- get_eigenvalue(res.pca_1kg)
fviz_eig(res.pca_1kg, addlabels = TRUE, ylim = c(0, 50))
plot_ly(ind.pca_1kg, x = ~Dim.2, y = ~Dim.1, z = ~Dim.3, color = ~macro, colors = cols25(),
type = 'scatter3d', mode = 'markers', marker = list(size = 3),
hoverinfo = 'text', text = ~paste('</br> ID: ', Individual,
'</br> Haplogroup: ', macro,
'</br> PC1: ', round(Dim.1, 2),
'</br> PC2: ', round(Dim.2, 2),
'</br> PC3: ', round(Dim.3, 2)))
## UMAP
embedding_mt_2d <- ped_1kg %>%
select(starts_with("mt")) %>%
select(-all_of(miss_gt)) %>%
# mutate_all(., .funs = function(x){ifelse(is.na(x), round(mean(x, na.rm = TRUE)), x)}) %>%
umapr::umap(., min_dist = 0.5, n_neighbor = 30, n_components = 2) %>%
as.tibble() %>%
bind_cols(select(ped_1kg, Individual, haplogroup, macro))
## [1] "umap-learn already installed"
embedding_mt_3d <- ped_1kg %>%
select(starts_with("mt")) %>%
select(-all_of(miss_gt)) %>%
# mutate_all(., .funs = function(x){ifelse(is.na(x), round(mean(x, na.rm = TRUE)), x)}) %>%
umapr::umap(., min_dist = 0.5, n_neighbor = 30, n_components = 3) %>%
as.tibble() %>%
bind_cols(select(ped_1kg, Individual, haplogroup, macro))
## [1] "umap-learn already installed"
## PCA+UMAP
embedding_pca_2d <- select(ind.pca_1kg, starts_with("Dim")) %>%
umapr::umap(., min_dist = 0.5, n_neighbor = 30, n_components = 2) %>%
as.tibble() %>%
bind_cols(select(ped_1kg, Individual, haplogroup, macro))
## [1] "umap-learn already installed"
embedding_pca_3d <- select(ind.pca_1kg, starts_with("Dim")) %>%
umapr::umap(., min_dist = 0.5, n_neighbor = 30, n_components = 3) %>%
as.tibble() %>%
bind_cols(select(ped_1kg, Individual, haplogroup, macro))
## [1] "umap-learn already installed"
ggpubr::ggarrange(
ggplot(embedding_mt_2d, aes(x = UMAP1, y = UMAP2, color = macro)) +
geom_point() + scale_color_manual(values = cols25()) + theme_bw() + labs(title = "UMAP"),
ggplot(embedding_pca_2d, aes(x = UMAP1, y = UMAP2, color = macro)) +
geom_point() + scale_color_manual(values = cols25()) + theme_bw() + labs(title = "PCA+UMAP"),
common.legend = TRUE, legend = "bottom"
)
plot_ly(embedding_mt_3d, x = ~UMAP2, y = ~UMAP1, z = ~UMAP3, color = ~macro, colors = cols25(),
type = 'scatter3d', mode = 'markers', marker = list(size = 3),
hoverinfo = 'text', text = ~paste('</br> ID: ', Individual,
'</br> Macro: ', macro,
'</br> Haplogroup: ', haplogroup,
'</br> PC1: ', round(UMAP1, 2),
'</br> PC2: ', round(UMAP2, 2),
'</br> PC3: ', round(UMAP3, 2))) %>%
layout(title="UMAP")
plot_ly(embedding_pca_3d, x = ~UMAP2, y = ~UMAP1, z = ~UMAP3, color = ~macro, colors = cols25(),
type = 'scatter3d', mode = 'markers', marker = list(size = 3),
hoverinfo = 'text', text = ~paste('</br> ID: ', Individual,
'</br> Macro: ', macro,
'</br> Haplogroup: ', haplogroup,
'</br> PC1: ', round(UMAP1, 2),
'</br> PC2: ', round(UMAP2, 2),
'</br> PC3: ', round(UMAP3, 2))) %>%
layout(title="PCA+UMAP")